\[%% % Add your macros here; they'll be included in pdf and html output. %% \newcommand{\R}{\mathbb{R}} % reals \newcommand{\E}{\mathbb{E}} % expectation \renewcommand{\P}{\mathbb{P}} % probability \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\logistic}{logistic} \DeclareMathOperator{\sd}{sd} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\Normal}{Normal} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\Beta}{Beta} \DeclareMathOperator{\Binom}{Binomial} \DeclareMathOperator{\Gam}{Gamma} \DeclareMathOperator{\Exp}{Exponential} \DeclareMathOperator{\Cauchy}{Cauchy} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Dirichlet}{Dirichlet} \newcommand{\given}{\;\vert\;} \]

t-SNE

http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf

t-SNE is a recently popular dimension reduction method.

It makes very nice visualizations.

Simulate data

Let’s first make some data. This will be some points distributed around an ellipse in \(n\) dimensions.

# Simulate npts points in n dimensions: N(0,1) noise around an ellipse
# of radius 4 lying in a random two-dimensional plane.
n <- 20      # ambient dimension
npts <- 1e3  # number of points
xy <- matrix(rnorm(n*npts), ncol=n)
theta <- runif(npts) * 2 * pi  # angle of each point around the ellipse
# Two random directions in R^n, one per ROW of ab; Gram-Schmidt them into
# an orthonormal pair.  (Fix: the original projected column 2 off column 1
# — but the columns are only the 2-vectors of coordinates; the rows are
# the basis vectors — and divided by the norm instead of the squared norm.)
ab <- matrix(rnorm(2*n), nrow=2)
ab[2,] <- ab[2,] - ab[1,] * sum(ab[1,] * ab[2,]) / sum(ab[1,]^2)
ab <- sweep(ab, 1, sqrt(rowSums(ab^2)), "/")  # normalize each row
# Add the ellipse displacement to every point at once
# (vectorized; replaces the row-by-row loop).
xy <- xy + 4 * cbind(cos(theta), sin(theta)) %*% ab

Here’s what the data look like:

pairs(xy, pch=20)

plot of chunk plot_data

But there is hidden, two-dimensional structure:

# Project the data onto the two hidden basis directions (the rows of ab);
# coloring by rank(theta) makes the ordering around the ring visible.
plot(xy %*% t(ab), xlab='dimension 1', ylab='dimension 2', 
     pch=20, col=rainbow(nrow(xy))[rank(theta)])

plot of chunk plot_ring

Now let’s build the distance matrix.

dmat <- as.matrix(dist(xy)) 

Implementation

The quantity to be minimized is the Kullback–Leibler divergence between \(p\) and \(q\), defined as \[\begin{aligned} \text{KL}(p \| q) &= \sum_x p_x \log(p_x/q_x) . \end{aligned}\]

Here is a Stan block.

# Stan program whose log-density is (minus) the Kullback-Leibler divergence
# KL(p || q) between input affinities p (Gaussian kernel on the squared
# distances, bandwidth sigma_sq, estimated jointly) and embedding affinities
# q (Student-t with 1 df on the embedded points y, as in t-SNE).
# NOTE(review): p and q are each normalized per ROW (asymmetric-SNE style),
# not over all pairs as in the original t-SNE paper, and the diagonal
# entries remain in the row vectors and in the KL sum — confirm intended.
tsne_block <- '
data {
    int N; // number of points
    int n; // input dimension
    int k; // output dimension
    matrix[N,N] dsq;  // distance matrix, squared
}
parameters {
    real<lower=0> sigma_sq; // in kernel for p 
    matrix[N,k] y;
}
model {
    matrix[N,N] q;
    real dt;
    matrix[N,N] p;
    q[N,N] = 1.0;
    for (i in 1:(N-1)) {
        q[i,i] = 1.0;
        for (j in (i+1):N) {
            q[i,j] = 1 / (1 + squared_distance(y[i], y[j]));
            q[j,i] = q[i,j];
        }
    }
    for (i in 1:N) {
        q[i] = q[i] / (sum(q[i]) - q[i,i]);
    }
    // create p matrix
    p = exp(-dsq/(2*sigma_sq));
    for (i in 1:N) {
        p[i] = p[i] / (sum(p[i]) - p[i,i]);
    }
    // compute the target
    for (i in 1:N) {
        dt = (-1) * sum(p[i] .* log(p[i] ./ q[i]));
        target += dt;
    }
    sigma_sq ~ normal(0, 10);
}'


tk <- 2  # output (embedding) dimension
# Compile the Stan model and find a MAP embedding by optimization rather
# than sampling.  Distances are rescaled to [0,1] before squaring so the
# sigma_sq prior scale is comparable across data sets.
# NOTE(review): stan_model() and optimizing() come from the rstan package,
# which is presumably loaded earlier in the document — confirm.
tsne_model <- stan_model(model_code=tsne_block)
runtime <- system.time(tsne_optim <- optimizing(tsne_model,
                                 data=list(N=nrow(xy),
                                           n=ncol(xy),
                                           k=tk,
                                           dsq=(dmat/max(dmat))^2)))
## Initial log joint probability = -260.345
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
runtime
##    user  system elapsed 
##  22.040   0.068  22.148

It works!

# Pull the fitted coordinates out of the named parameter vector,
# one embedding column y[., k] at a time.
out_y <- sapply(seq_len(tk), function (k) {
    tsne_optim$par[sprintf("y[%d,%d]", seq_len(nrow(xy)), k)]
})
# Side by side: the t-SNE embedding vs the true hidden ring coordinates,
# both colored by position around the ring.
layout(t(1:2))
plot(out_y, xlab='t-sne 1', ylab='t-sne 2', main="t-SNE",
     pch=20, col=rainbow(nrow(xy))[rank(theta)])
plot(xy %*% t(ab), xlab='input 1', ylab='input 2', main="'truth'",
     pch=20, col=rainbow(nrow(xy))[rank(theta)])

plot of chunk plot_tsne

Another case

This time let’s apply the method to a high-dimensional random walk.

# Simulate a random walk in n dimensions: npts steps of iid N(0,1)
# increments, accumulated down each column.
# Fix: colCumsums() is from the matrixStats package, which is never
# loaded here; base R's apply(..., 2, cumsum) gives the same matrix.
n <- 40      # ambient dimension
npts <- 1e2  # number of steps
rw_steps <- matrix(rnorm(n*npts), ncol=n)
rw <- apply(rw_steps, 2, cumsum)

Here’s what the data look like:

pairs(rw, col=adjustcolor(rainbow(npts), 0.5), pch=20)

plot of chunk plot_rw

Now let’s build the distance matrix.

# Distance matrix for the random walk, then a second MAP fit reusing the
# already-compiled model (distances again rescaled to [0,1] before squaring).
rw_dmat <- as.matrix(dist(rw)) 
rw_runtime <- system.time(rw_tsne_optim <- optimizing(tsne_model,
                                 data=list(N=nrow(rw),
                                           n=ncol(rw),
                                           k=tk,
                                           dsq=(rw_dmat/max(rw_dmat))^2)))
## Initial log joint probability = -39.0414
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
rw_runtime
##    user  system elapsed 
##   0.301   0.000   0.302

It works!

# Extract the fitted 2-d embedding of the random walk.
# Fix: index parameter names by nrow(rw) (the number of walk points),
# not nrow(xy) — the two data sets differ in size (1e2 vs 1e3), so
# indexing by nrow(xy) requested parameters that do not exist.
rw_y <- do.call(cbind, lapply(1:tk, function (k) {
                                   rw_tsne_optim$par[sprintf("y[%d,%d]", 1:nrow(rw), k)]
                           } ) )
# Color by step index so the walk's ordering shows in the embedding.
plot(rw_y, xlab='t-sne 1', ylab='t-sne 2', main="t-SNE",
     pch=20, col=rainbow(nrow(rw)))

plot of chunk plot_rw_tsne